Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.
library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
set.seed(1)
colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
"Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
"Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
"Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
"Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
"Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
"Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
"Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
"Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
"Image_Width","Image_X","Image_Y","Intensity","Length",
"Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
"Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
"Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
"Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")
dd_all <- read_csv("../Class_18C_normalLight/1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
# dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)
# table(dd_all$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Chlamydomonas_light_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Cosmarium_light_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Desmodesmus <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Desmodesmus_light_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Monoraphidium_light_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum1_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum2_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_feb22) <- colnames
# table(dd_all_feb22$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Chlamydomonas_light_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cosmarium_light_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cryptomonas_light_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Desmodesmus_light_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Monoraphidium_light_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum1_light_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum2_light_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220316 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220316) <- colnames
# table(dd_all_20220316$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Chlamydomonas_light_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cosmarium_light_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cryptomonas_light_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Desmodesmus_light_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Monoraphidium_light_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum1_light_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum2_light_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220406 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220406) <- colnames
# table(dd_all_20220406$species)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Chlamydomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cosmarium_dark_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Desmodesmus_dark_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Monoraphidium_dark_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum1_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum2_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
dd_all_feb22_dark <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2,
Cryptomonas)
colnames(dd_all_feb22_dark) <- colnames
# table(dd_all_feb22_dark$species)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Chlamydomonas_dark_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cosmarium_dark_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cryptomonas_dark_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Desmodesmus_dark_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Monoraphidium_dark_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum1_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum2_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220316_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220316_dark) <- colnames
# table(dd_all_20220316_dark$species)
rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Chlamydomonas_dark_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cosmarium_dark_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cryptomonas_dark_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Desmodesmus_dark_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Monoraphidium_dark_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum1_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum2_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220406_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220406_dark) <- colnames
# table(dd_all_20220406_dark$species)
rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)
dd_all$data <- "Late 2020"
dd_all_feb22$data <- "20220201"
dd_all_20220316$data <- "20220316"
dd_all_20220406$data <- "20220406"
dd <- rbind(dd_all,dd_all_feb22,dd_all_20220316,dd_all_20220406)
dd_all_feb22_dark$data <- "20220201"
dd_all_20220316_dark$data <- "20220316"
dd_all_20220406_dark$data <- "20220406"
dd$light <- "30%"
dd_all_feb22_dark$light <- "18%"
dd_all_20220316_dark$light <- "6%"
dd_all_20220406_dark$light <- "1%"
dd <- rbind(dd,dd_all_feb22_dark,dd_all_20220316_dark,dd_all_20220406_dark)
# table(dd$species, dd$data, dd$light)
If an individual is an outlier in more than 4 traits it gets excluded (about 6% gets excluded). Outlier based on boxplot definition.
dd$id <- 1:nrow(dd)
dd_long <- dd %>%
dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
Average_Green,Average_Red,Circle_Fit,Circularity,
Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
Symmetry,Transparency,Width, id, data, light) %>%
pivot_longer(cols=-c(id,species, data, light), names_to="variable") %>%
dplyr::group_by(variable,species,data,light) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id,data,light) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>4)
## `summarise()` has grouped output by 'species', 'id', 'data'. You can override using the `.groups` argument.
# table(outliers$species)
# nrow(outliers)/nrow(dd)
dd_filtered <- dd %>%
dplyr::filter(!is.element(id,outliers$id))
dd$removed_outliers <- F
dd_filtered$removed_outliers <- T
dd_comparison <- rbind(dd,dd_filtered)
# dd_comparison %>%
# ggplot(aes(log10(Area_ABD), col=removed_outliers))+
# geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
# geom_boxplot(outlier.alpha = 0.3) +
# theme_bw() +
# facet_wrap(species~interaction(light,data), scales = "free_y") +
# geom_vline(xintercept = 1, col="black")
#
# dd_filtered %>%
# ggplot(aes(log10(Area_ABD)))+
# geom_density(aes(col=species))
We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.
dd <- dd %>%
mutate(data.light = interaction(data,light, drop = T),
data.light.species=interaction(data,light,species, drop = T))
print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##
## airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
## 20220406.1% 0 2035 0 0
## 20220201.18% 0 3023 0 0
## 20220201.30% 0 6004 0 0
## 20220316.30% 0 6463 0 0
## 20220406.30% 0 4391 0 0
## Late 2020.30% 112 1099 359 475
## 20220316.6% 0 4410 0 0
##
## Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
## 20220406.1% 0 0 93 2162 0
## 20220201.18% 0 0 176 2987 0
## 20220201.30% 0 0 1103 0 0
## 20220316.30% 0 0 328 3334 0
## 20220406.30% 0 0 429 2901 0
## Late 2020.30% 101 264 341 781 339
## 20220316.6% 0 0 164 2239 0
##
## Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
## 20220406.1% 874 0 0 0
## 20220201.18% 1113 0 0 0
## 20220201.30% 5122 0 0 0
## 20220316.30% 4068 0 0 0
## 20220406.30% 3655 0 0 0
## Late 2020.30% 1051 160 154 398
## 20220316.6% 2573 0 0 0
##
## Loxocephallus Monoraphidium OtherCiliate Small_unidentified
## 20220406.1% 0 578 0 0
## 20220201.18% 0 1916 0 0
## 20220201.30% 0 746 0 0
## 20220316.30% 0 1062 0 0
## 20220406.30% 0 1026 0 0
## Late 2020.30% 224 1489 164 4642
## 20220316.6% 0 855 0 0
##
## Staurastrum1 Staurastrum2 Tetrahymena
## 20220406.1% 212 91 0
## 20220201.18% 111 124 0
## 20220201.30% 589 126 0
## 20220316.30% 244 52 0
## 20220406.30% 604 89 0
## Late 2020.30% 510 75 108
## 20220316.6% 225 94 0
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
## airbubbles Chlamydomonas ChlamydomonasClumps
## 112 27425 359
## Coleps_irchel Colpidium ColpidiumVacuoles
## 475 101 264
## Cosmarium Cryptomonas Debris
## 2634 14404 339
## Desmodesmus Dexiostoma DigestedAlgae
## 18456 160 154
## DividingChlamydomonas Loxocephallus Monoraphidium
## 398 224 7672
## OtherCiliate Small_unidentified Staurastrum1
## 164 4642 2495
## Staurastrum2 Tetrahymena
## 651 108
Clearly, the amount of individuals per class varies a lot… I think the most limiting factor is Staurastrum2, i.e. an algae we are interested in but of which only have rather few individuals. For now at least I try to take 2 * Staurastrum2 as the number of individuals per species to be included (if a species/class has less than that all individuals are included).
Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.
n <- colsums["Staurastrum2"]*2
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1302"
num_ind_per_class <- dd %>% group_by(species) %>%
summarize(num_class = length(unique(data.light.species)),
num_ind_per_class = floor(n/num_class)) %>%
select(species,num_ind_per_class)
num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species
set.seed(53)
split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
species <- unique(x$species)
x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)
print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##
## airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
## 20220406.1% 0 186 0 0
## 20220201.18% 0 186 0 0
## 20220201.30% 0 186 0 0
## 20220316.30% 0 186 0 0
## 20220406.30% 0 186 0 0
## Late 2020.30% 112 186 359 475
## 20220316.6% 0 186 0 0
##
## Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
## 20220406.1% 0 0 93 217 0
## 20220201.18% 0 0 176 217 0
## 20220201.30% 0 0 186 0 0
## 20220316.30% 0 0 186 217 0
## 20220406.30% 0 0 186 217 0
## Late 2020.30% 101 264 186 217 339
## 20220316.6% 0 0 164 217 0
##
## Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
## 20220406.1% 186 0 0 0
## 20220201.18% 186 0 0 0
## 20220201.30% 186 0 0 0
## 20220316.30% 186 0 0 0
## 20220406.30% 186 0 0 0
## Late 2020.30% 186 160 154 398
## 20220316.6% 186 0 0 0
##
## Loxocephallus Monoraphidium OtherCiliate Small_unidentified
## 20220406.1% 0 186 0 0
## 20220201.18% 0 186 0 0
## 20220201.30% 0 186 0 0
## 20220316.30% 0 186 0 0
## 20220406.30% 0 186 0 0
## Late 2020.30% 224 186 164 1302
## 20220316.6% 0 186 0 0
##
## Staurastrum1 Staurastrum2 Tetrahymena
## 20220406.1% 186 91 0
## 20220201.18% 111 124 0
## 20220201.30% 186 126 0
## 20220316.30% 186 52 0
## 20220406.30% 186 89 0
## Late 2020.30% 186 75 108
## 20220316.6% 186 94 0
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
## airbubbles Chlamydomonas ChlamydomonasClumps
## 112 1302 359
## Coleps_irchel Colpidium ColpidiumVacuoles
## 475 101 264
## Cosmarium Cryptomonas Debris
## 1177 1302 339
## Desmodesmus Dexiostoma DigestedAlgae
## 1302 160 154
## DividingChlamydomonas Loxocephallus Monoraphidium
## 398 224 1302
## OtherCiliate Small_unidentified Staurastrum1
## 164 1302 1227
## Staurastrum2 Tetrahymena
## 651 108
trainingdata <- trainingdata[complete.cases(trainingdata),]
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.75, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
Average_Blue + Average_Green + Average_Red + Circle_Fit +
Circularity + Compactness +
Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
Width
set.seed(8765)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()
for(i in 1:length(compositions_list)){
if("Colpidium" %in% compositions_list[[i]]) {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
} else {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
"DigestedAlgae","DividingChlamydomonas"))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
"DigestedAlgae","DividingChlamydomonas"))
}
sub_trainingdata$species <- factor(sub_trainingdata$species)
sub_testdata$species <- factor(sub_testdata$species)
# split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
# subsamples <- lapply(split_up, function(x) {
# x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
# sub_trainingdata <- do.call("rbind", subsamples)
# # A stratified random split of the data
# idx_train <- createDataPartition(sub_trainingdata$species,
# p = 0.7, # percentage of data as training
# list = FALSE)
#
# sub_testdata <- sub_trainingdata[-idx_train,]
# sub_trainingdata <- sub_trainingdata[idx_train,]
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_plots) <- names(classifiers_18c) <-
names(classifiers_18c_assess_plots) <- comp_name
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] +
plot_layout(widths = c(7,3)))
}
saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_16x_20220316_MergedData.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>%
# group_by(species) %>%
# summarise(xPos = max(Area_Filled),
# yPos = max((density(Area_Filled))$y))
#
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))